library(usethis)
library(dplyr)
library(tidytuesdayR)
library(tidyverse)
#library(cancerprof) - need to put in directly from github but not reloaded automatically so may need new codeblock for reproducibility
library(tidycensus)
library(broom)
library(plotly)
library(gt)
library(gtsummary)
library(ggplot2)
library(janitor)
library(sf)
library(tigris)
library(naniar)
#library(usmap) # for simple state mapping (need above)
library(ggthemes) # optional, for prettier maps (need above
options(tigris_use_cache = TRUE)## ---- Compiling #TidyTuesday Information for 2025-04-08 ----
## --- There is 1 file available ---
##
##
## ── Downloading files ───────────────────────────────────────────────────────────
##
## 1 of 1: "care_state.csv"
#we need to group the averages versus the percentages
#There's Sepsis and Opioid prescription measures which are percentages which are not labeled as percentages
# conditions are the themes for the types of measures
# Classify each measure as an average ('avg') or a percentage ('pct') from its
# name; anything matching neither pattern stays missing.
hosp_dat3 <- hosp_dat2 %>%
mutate(typenum = case_when(
str_detect(measure_name, '[Aa]verage') ~ 'avg',
str_detect(measure_name, '[Pp]ercentage|Safe\\sUse|Septic\\sShock|Severe\\sSepsis') ~ 'pct',
TRUE ~ NA_character_ # typed NA: keeps the column character on every dplyr version (plain NA errors before dplyr 1.1)
))
# Check whether any measure spans more than one reporting window:
# if a measure's min and max durations agree, its window is uniform.
hosp_dat3 %>%
  mutate(time_var = time_length(measure_len, unit = 'days')) %>%
  group_by(measure_name) %>%
  summarise(min_time = min(time_var, na.rm = TRUE),
            max_time = max(time_var, na.rm = TRUE)) %>%
  filter(min_time != max_time)
# Same question phrased with range(): two rows (min, max) per measure.
hosp_dat3 %>%
  mutate(time_var = time_length(measure_len, unit = 'days')) %>%
  group_by(measure_name) %>%
  reframe(ran_var = range(time_var, na.rm = TRUE)) %>%
  print(n = 42)
# Duration is the same regardless of measure, so scores compare directly.
# Some ED measures are hard to interpret: avg time in the ED before
# discharge comes with unexplained 'levels' in parentheses.
# Median score across states for each average-type ED measure.
hosp_dat3 %>%
  filter(condition == 'Emergency Department', typenum == 'avg') %>%
  group_by(measure_name) %>%
  summarise(median_scr = median(score, na.rm = TRUE))
# Score ranges for the OP_* minute-based measures.
hosp_dat3 %>%
  filter(str_detect(measure_id, '^OP_?.*MIN?.*')) %>%
  group_by(measure_id) %>%
  reframe(min_score = min(score, na.rm = TRUE),
          max_score = max(score, na.rm = TRUE))
# The LOW/MEDIUM/HIGH/VERY HIGH breakouts of OP_18b/OP_18c are hard to
# interpret (undocumented levels; not every state has every level), so the
# 'raw' OP_18b and OP_18c scores are preferable.
# 18b and 18c seem to contain the others; SEP_1 may likewise contain
# SEP_SH_3HR, SEP_SH_6HR, SEV_SEP_3HR, and SEV_SEP_6HR.
# Wide view: one row per state, one column per sepsis measure.
hosp_dat3 %>%
  filter(str_detect(measure_id, '.*SEP_?.*')) %>%
  select(state, condition, measure_id, score) %>%
  pivot_wider(id_cols = state, names_from = measure_id, values_from = score)
#2020 state population data
# 2020 decennial state population totals (P1_001N = total population).
cen_totpop <- tidycensus::get_decennial(geography = 'state', year = 2020, variables = 'P1_001N')
# Income and poverty levels.
# ACS variable codes to pull; with output = "wide", columns ending in 'E'
# are Estimates and columns ending in 'M' are Margins of Error.
vars <- c(
  median_inc = "B19013_001",
  per_capita_inc = "B19301_001",
  gini_ind = "B19083_001",
  poverty_ct = "B17001_002",
  poverty_tot = "B17001_001"
)
# 5-year ACS (2018-2022) state-level estimates, one column per variable.
acs_incdat <- get_acs(
  geography = "state",
  variables = vars,
  year = 2022,
  survey = "acs5",
  output = "wide"
)
# Land area of the states (Census gazetteer file).
state_area_data <- readr::read_tsv('2024_Gaz_state_national.txt')
# State-level drug overdose mortality.
ovrdose_st <- read_csv('drug_mortality_bystate_2022.csv')
# Keep the 2022 rows, shorten DC's name for joins, drop the URL column,
# and give the columns friendlier names.
ovrdose_st2 <- ovrdose_st %>%
  filter(YEAR == 2022) %>%
  mutate(STATE = if_else(STATE == 'District of Columbia', 'DC', STATE)) %>%
  select(-URL) %>%
  rename(state = STATE,
         Year_dat = YEAR,
         Overdose_deaths = DEATHS,
         Rate_per_100k = RATE)
#SEP_SH_3HR vs. SEP_SH_6HR; SEV_SEP_3HR vs. SEV_SEP_6HR
#3hr vs. 6 hr bundles for sepsis
# Relative change in score going from the 3-hour to the 6-hour bundle, for
# both septic shock (SEP_SH_*) and severe sepsis (SEV_SEP_*) measures.
hosp_dat3 %>%
# keep only the sepsis measures (SEP_1, SEP_SH_*, SEV_SEP_*)
filter(str_detect(measure_id, '.*SEP_?.*')) %>%
select(state, condition, measure_id, score) %>%
# one row per state, one column per sepsis measure
pivot_wider(id_cols = state, names_from = measure_id, values_from=score) %>%
# proportional difference relative to the 3-hour bundle score
mutate(thrvssix_shock = (SEP_SH_6HR-SEP_SH_3HR)/SEP_SH_3HR,
thrvssix_svr = (SEV_SEP_6HR - SEV_SEP_3HR)/SEV_SEP_3HR
) ## Warning: Removed 5 rows containing missing values or values outside the scale range
## (`geom_line()`).
# NOTE(review): the geom_line() warning above looks like stray knitr output
# from another chunk — confirm it belongs here.
First, states or variables without informative data, with insufficient data, or with no data at all are dropped from these analyses. Many of the territories had no data. Additionally, OP_31 was only present for 12 states.
The data consist of a mix of higher-is-better and lower-is-better measures, as well as absolute and percentage scores. Here, the 'lower is better' scores — OP_22, SAFE_USE_OF_OPIOIDS, OP_18b, and OP_18c — are flipped.
# Step 3: Rank all numeric variables (but keep the ID variable as it is)
# rank(-.) gives rank 1 to the best (highest, post-flip) score; tied scores
# share the minimum possible rank.
hosp_ranked <- hospdat_flipped %>%
mutate(across(where(is.numeric), ~rank(-., ties.method = "min"))) #giving lower ranks to higher scores and handling ties as the minimum possible rank and same values
# Step 4: Extract numeric data (for PCA) and drop the ID variable (State)
ranked_numeric <- hosp_ranked %>%
select(-all_of(id_var)) #syntax is a bit weird for one variable but needed since I'm calling an external vector
# Step 5: Compute Spearman correlation matrix
cor_matrix <- cor(ranked_numeric, method = "spearman")
# Step 6: Perform PCA
# NOTE(review): prcomp() is run on the correlation matrix itself (then
# re-centered and re-scaled), not on the ranked observations. That is
# unconventional — PCA on ranked_numeric with scale. = TRUE is the usual
# route — confirm this was intentional before reusing.
pca_result <- prcomp(cor_matrix, center = TRUE, scale. = TRUE)# Step 7a: View eigenvalues and % variance explained
# Eigenvalue of each PC = squared singular value (sdev^2).
eigenvalues <- pca_result$sdev^2
prop_variance <- eigenvalues / sum(eigenvalues)
cum_variance <- cumsum(prop_variance)
# Summary table: eigenvalue, proportion, and cumulative variance per PC.
variance_explained <- data.frame(
PC = paste0("PC", 1:length(eigenvalues)),
Eigenvalue = eigenvalues,
Proportion = round(prop_variance, 3),
Cumulative = round(cum_variance, 3)
)
print(variance_explained)## PC Eigenvalue Proportion Cumulative
## 1 PC1 3.638907e+00 0.455 0.455
## 2 PC2 1.811141e+00 0.226 0.681
## 3 PC3 1.112091e+00 0.139 0.820
## 4 PC4 6.627018e-01 0.083 0.903
## 5 PC5 5.165507e-01 0.065 0.968
## 6 PC6 2.552873e-01 0.032 1.000
## 7 PC7 3.321550e-03 0.000 1.000
## 8 PC8 1.262481e-33 0.000 1.000
#Visualization for these total contributions of variables
# Two-column tibble (variable, contribution) from the sorted named vector.
total_cont <- enframe(ranked_contrib, name = "variable", value = "contribution")
# Horizontal bar chart with the largest contributors on top.
ggplot(total_cont, aes(x = reorder(variable, contribution), y = contribution)) +
  geom_col(fill = "skyblue") +
  coord_flip() +
  labs(title = "Variable Contributions to PCA (PC1, PC2, PC3)",
       x = "Variable", y = "Contribution") +
  theme_minimal()
# Optional: Plot a biplot of the PCA result
biplot(pca_result, scale = 0, main = "PCA on Ranked and Transformed Variables")
For this analysis both safe prescribing of opioids and general care are most important. OP_29 represents the percentage of patients receiving an appropriate recommendation for follow-up colonoscopy screening. OP_23, the percentage of patients with stroke symptoms presenting at the ED who received brain scan results within 45 minutes, and SEP_1, the percentage of patients receiving appropriate care for sepsis and septic shock, next contribute to these PCs.
The biplot of PC1 and PC2 shows similar contributions to PC1 for OP_18b and OP_18c — time in the emergency department for patients overall and for psychiatric patients. This is opposed to, or negatively correlated with, OP_29. This may hint at differences between Emergency Department efficiency and general-practice care for screenings. Also interesting are the PC2 results, with sepsis care being negatively correlated with safe opioid prescribing. This may represent differences in pharmacovigilance and pain management vs. quality of care for infectious and life-threatening disease, along PC2.
Again, eigenvalues are examined, as well as a scree plot. These results have more PCs; here 4 are chosen, capturing ~77% of the variance of the data.
## PC Eigenvalue Proportion Cumulative
## 1 PC1 2.54053571 0.318 0.318
## 2 PC2 1.50511484 0.188 0.506
## 3 PC3 1.13861459 0.142 0.648
## 4 PC4 0.97138875 0.121 0.769
## 5 PC5 0.70882473 0.089 0.858
## 6 PC6 0.58330207 0.073 0.931
## 7 PC7 0.48519686 0.061 0.992
## 8 PC8 0.06702245 0.008 1.000
# Step 8: Rank variable contributions to the first 4 PCs for the state data (optional)
# Squared loadings give each variable's contribution to a PC.
loadings2 <- pca_scaled_states$rotation
contrib_pc1b <- loadings2[, 1]^2
contrib_pc2b <- loadings2[, 2]^2
contrib_pc3b <- loadings2[, 3]^2
contrib_pc4b <- loadings2[, 4]^2
# Total contribution across the four retained PCs.
total_contrib2 <- contrib_pc1b + contrib_pc2b + contrib_pc3b + contrib_pc4b
# Rank by total contribution
ranked_contrib2 <- sort(total_contrib2, decreasing = TRUE)
#print(ranked_contrib2)
#not sure this is necessary because does total contributions for variables with state groupings analysis
# Convert the sorted named vector to a (variable, contribution) tibble.
total_cont2 <- ranked_contrib2 %>%
tibble(
variable = names(.), # The names of the vector are the variable names
contribution = .)
# NOTE(review): this prints the unsorted named vector total_contrib2, not
# the total_cont2 tibble built just above — confirm which was intended.
print(total_contrib2)## IMM_3 OP_18b OP_18c OP_22
## 0.4544104 0.4051159 0.3902834 0.3129296
## OP_23 OP_29 SAFE_USE_OF_OPIOIDS SEP_1
## 0.8942038 0.6521496 0.5384184 0.3524891
# Extracting the state scores (projections) onto the PCs
state_names <- hosp_ranked[[id_var]] # e.g., state abbreviations
# Get the PCA scores (these are your transformed coordinates in PC space)
state_scores <- as.data.frame(pca_scaled_states$x[, 1:4]) # taking the first 4 PCs as Eigen value ~1 or more ; explains 77% of the variance
# Attach the state identifier back onto the score matrix.
state_scores$state <- state_names
set.seed(555) # set seed to reproduce results
# k-means with 3 clusters; nstart = 25 restarts guards against poor local optima.
kmeans_result <- kmeans(state_scores[, 1:4], centers = 3, nstart = 25)
state_scores$cluster <- kmeans_result$cluster
# Aggregating the 4 PCs contributions to the state clusters:
# (mean of each PC within each cluster — the cluster "profile")
cluster_agg <- aggregate(. ~ cluster, data = state_scores[, c("PC1", "PC2", "PC3", "PC4", "cluster")], mean)# Join abbreviations and names for help joining with map datasets
# Lookup of lower-case state names (region) to USPS abbreviations (stabb),
# with DC folded in before sorting and Puerto Rico appended afterwards,
# for joining against map datasets.
statekey <- tibble(region = state.name, stabb = state.abb) %>%
  rbind(c('District of Columbia', 'DC')) %>%
  arrange(region) %>%
  rbind(c('Puerto Rico', 'PR')) %>%
  mutate(region = tolower(region))
#Edited code from ChatGPT with some modification for joins
# Build an inset-style US map: AK, HI, and PR are scaled/shifted/rotated
# into insets near the lower 48, then recombined with the other states.
# NOTE(review): st_scale4_crs, st_shift4_crs, and rotate_geometry2 are
# custom helpers defined elsewhere in this project — presumably sf
# transforms that preserve the CRS; confirm against their definitions.
# Get state geometries
states_formap <- tigris::states(cb = TRUE, resolution = "20m", class = "sf") %>%
st_transform(2163) # Albers Equal Area projection
# Check CRS of the full data
#st_crs(states_formap) # should be EPSG:4269 (NAD83)
# Alaska, explicitly resetting CRS after filtering
alaska <- states_formap %>%
filter(STUSPS == "AK") %>%
# st_set_crs(4269) %>% # Ensure original CRS is set (NAD83)
st_transform(2163) %>% # Transform to Albers projection (EPSG:2163)
st_scale4_crs(0.35) %>% # Scale with CRS preservation
st_shift4_crs(c(700000, - 4900000)) %>% #manually puts just south of CA
rotate_geometry2(., -40) #manually rotates the state for inset
# Hawaii
hawaii <- states_formap %>% filter(STUSPS == "HI") %>%
st_transform(2163) %>%
st_shift4_crs(c(5000000, -1450000)) %>% #manual shift for index
rotate_geometry2(., -40)
# Puerto Rico
puerto_rico <- states_formap %>% filter(STUSPS == "PR") %>%
st_transform(2163) %>%
st_shift4_crs(c(-1200000, -30000)) %>% # manual shift for index
rotate_geometry2(., 15)
# # Combine Alaska, Hawaii, and Puerto Rico with the rest of the states
remaining_states <- states_formap %>%
filter(!(STUSPS %in% c("AK", "HI", "PR")))
# Combine the transformed data
states_combined <- bind_rows(remaining_states, alaska, hawaii, puerto_rico)# Plot
# Choropleth of the k-means cluster assignment per state.
# NOTE: the original stacked two scale_fill_manual() calls and two labs()
# calls; ggplot2 keeps only the last of each (with a "Scale for fill is
# already present" warning), so only the effective versions are kept here.
map1 <- left_join(states_combined, state_scores, by = c("STUSPS" = "state")) %>%
  ggplot(.) +
  geom_sf(aes(fill = as.factor(cluster)), color = "white", size = 0.2) +
  scale_fill_manual(values = c("1" = "#1b9e77", "2" = "#d95f02", "3" = "#7570b3"),
                    labels = c("1: High Acute Care Performance,\n Lower opioid/clinical safety",
                               "2: Better safety scores & avg ED scores,\n Poorer gen practice",
                               "3: Prevention-oriented,\n Most ED scores poor")) +
  theme_void() +
  labs(fill = "Cluster Type",
       title = "State Healthcare System Clusters*",
       subtitle = "Based on PCA + Cluster Analysis of CMS Measures",
       caption = '*Includes Puerto Rico')
  # Possible legend tweak: legend.position = c(1.1, 0.5) with extra right
  # margin via theme() if the legend crowds the map.
map1
#poverty stats are by state name, so need to reconvert statekey to camel case and join
#state area data already has USPS abbreviations
#cen_totpop has names
#datasets are: acs_incdat, state_area_data, cen_totpop
# Socioeconomic covariates for the second PCA: population density, median
# income, and a flipped GINI index, keyed by USPS state abbreviation.
extra_vars <- cen_totpop %>%
rename(pop2020 = value) %>%
select(-c(variable)) %>%
# right_join keeps every ACS row, dropping decennial rows without ACS data
right_join(., acs_incdat, by = c('GEOID', 'NAME')) %>%
left_join(., state_area_data, by = c('GEOID', 'NAME')) %>%
rename(state = USPS) %>%
relocate(state, .before=everything()) %>%
select(c(state, pop2020, ALAND_SQMI, median_incE, gini_indE)) %>%
mutate(popdensity = pop2020/ALAND_SQMI,
gini_Eflip = 1 - gini_indE) %>% # the lower the GINI score, the more equality so flipping this
select(-c(pop2020, ALAND_SQMI, gini_indE))
# Step 3: Rank all numeric variables (but keep the ID variable as it is)
# Same ranking as the first analysis, now with the socioeconomic
# covariates (popdensity, median_incE, gini_Eflip) joined in by state.
hosp_ranked2 <- hospdat_flipped %>%
left_join(., extra_vars, by = 'state') %>%
mutate(across(where(is.numeric), ~rank(-., ties.method = "min"))) #giving lower ranks to higher scores and handling ties as the minimum possible rank and same values
# Step 4: Extract numeric data (for PCA) and drop the ID variable (State)
ranked_numeric2 <- hosp_ranked2 %>%
select(-all_of(id_var))## PC Eigenvalue Proportion Cumulative
## 1 PC1 3.44983431 0.314 0.314
## 2 PC2 1.80986477 0.165 0.478
## 3 PC3 1.45988244 0.133 0.611
## 4 PC4 1.07784251 0.098 0.709
## 5 PC5 0.91953672 0.084 0.792
## 6 PC6 0.64451396 0.059 0.851
## 7 PC7 0.53447404 0.049 0.900
## 8 PC8 0.49283058 0.045 0.944
## 9 PC9 0.37090233 0.034 0.978
## 10 PC10 0.17829570 0.016 0.994
## 11 PC11 0.06202264 0.006 1.000
# Step 8: Rank variable contributions to the first 5 PCs for the state data (optional) - later I work on 4 PCs given the EigenValues
# Squared loadings give each variable's contribution to a PC.
loadings2b <- pca_scaled_states2$rotation
contrib_pc1c <- loadings2b[, 1]^2
contrib_pc2c <- loadings2b[, 2]^2
contrib_pc3c <- loadings2b[, 3]^2
contrib_pc4c <- loadings2b[, 4]^2
contrib_pc5c <- loadings2b[, 5]^2
# Total contribution across the first five PCs.
total_contrib2b <- contrib_pc1c + contrib_pc2c + contrib_pc3c + contrib_pc4c + contrib_pc5c
# Rank by total contribution
ranked_contrib2b <- sort(total_contrib2b, decreasing = TRUE)
#print(ranked_contrib2)
#not sure this is necessary because does total contributions for variables with state groupings analysis, this includes PC5
# Convert the sorted named vector to a (variable, contribution) tibble.
total_cont2b <- ranked_contrib2b %>%
tibble(
variable = names(.), # The names of the vector are the variable names
contribution = .)
print(total_cont2b)## # A tibble: 11 × 2
print(total_cont2b)## # A tibble: 11 × 2
## variable contribution
## <chr> <dbl>
## 1 OP_23 0.678
## 2 median_incE 0.625
## 3 SAFE_USE_OF_OPIOIDS 0.623
## 4 OP_29 0.547
## 5 SEP_1 0.503
## 6 IMM_3 0.446
## 7 gini_Eflip 0.363
## 8 OP_22 0.343
## 9 popdensity 0.336
## 10 OP_18b 0.276
## 11 OP_18c 0.260
**Loadings for the First 4 PCs**

| Healthcare Variable | PC1 Loading | PC2 Loading | PC3 Loading | PC4 Loading |
|---|---|---|---|---|
| IMM_3 | −0.173 | 0.501 | 0.150 | −0.010 |
| OP_18b | 0.501 | 0.013 | −0.008 | 0.142 |
| OP_18c | 0.486 | 0.032 | 0.031 | 0.137 |
| OP_22 | 0.306 | −0.116 | 0.385 | −0.143 |
| OP_23 | −0.097 | −0.073 | 0.510 | 0.634 |
| OP_29 | 0.013 | 0.426 | −0.123 | 0.575 |
| SAFE_USE_OF_OPIOIDS | 0.056 | −0.335 | −0.447 | 0.286 |
| SEP_1 | 0.218 | 0.102 | 0.515 | −0.215 |
| median_incE | −0.175 | 0.448 | −0.078 | −0.243 |
| popdensity | −0.460 | −0.050 | 0.196 | 0.139 |
| gini_Eflip | 0.295 | 0.472 | −0.209 | 0.036 |
**Cluster Summary for Principal Components**

| State Cluster | PC1 Mean | PC2 Mean | PC3 Mean | PC4 Mean |
|---|---|---|---|---|
| 1 | 0.180 | 1.172 | 0.019 | 0.140 |
| 2 | 2.196 | −1.108 | −0.173 | 0.171 |
| 3 | −2.043 | −0.784 | 0.114 | −0.340 |